/**************************************************************************************************************************************************************************************************************************************
Do file to scrape https://www.deutsche-digitale-bibliothek.de and save the citation data to german_newspaper_data_w_indicators.dta
***************************************************************************************************************************************************************************************************************************************
***************************************************************************************************************************************************************************************************************************************
Here are a few URLs to test the resulting output against:

"Karl Marx" for the 3 newspapers in 1933:
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1933&fromYear=1933&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Karl%20Marx%22
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1933&fromYear=1933&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Karl%20Marx%22&zdb_id=2814128-3
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1933&fromYear=1933&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Karl%20Marx%22&zdb_id=3073896-9

"Herbert Spencer" in 1928.
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1928&fromYear=1928&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Herbert%20Spencer%22
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1928&fromYear=1928&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Herbert%20Spencer%22&zdb_id=2814128-3
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1928&fromYear=1928&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Herbert%20Spencer%22&zdb_id=3073896-9

"Adam Smith" in 1923.
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1923&fromYear=1923&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Adam%20Smith%22
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1923&fromYear=1923&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Adam%20Smith%22&zdb_id=2814128-3
https://www.deutsche-digitale-bibliothek.de/search/newspaper?fromDay=1&toYear=1923&fromYear=1923&toDay=31&toMonth=12&fromMonth=1&lang=en&query=%22Adam%20Smith%22&zdb_id=3073896-9

* Unconditional (blank query) in 1930.
https://www.deutsche-digitale-bibliothek.de/search/newspaper?query=&language=ger&lang=en&rows=1&fromDay=1&fromMonth=1&fromYear=1930&toDay=31&toMonth=12&toYear=1930
https://www.deutsche-digitale-bibliothek.de/search/newspaper?query=&language=ger&lang=en&rows=1&fromDay=1&fromMonth=1&fromYear=1930&toDay=31&toMonth=12&toYear=1930&zdb_id=2814128-3
https://www.deutsche-digitale-bibliothek.de/search/newspaper?query=&language=ger&lang=en&rows=1&fromDay=1&fromMonth=1&fromYear=1930&toDay=31&toMonth=12&toYear=1930&zdb_id=3073896-9
**************************************************************************************************************************************************************************************************************************************/

* Pin command interpretation to Stata 18 and start from an empty dataset.
version 18.0
clear

* Verbosity switch. Top-level commands in this do-file are prefixed with `quietly';
* set the macro to the empty string instead of "quietly" to see their output.
* Note: a local macro is private to the do-file or program that defines it, so this macro
* is NOT visible inside the bodies of programs defined with "program define" below.
local quietly "quietly"

/*******************************************************************************
Program to convert an author's name into a form that can be embedded into a URL.
E.g., spaces must be converted into %20.
And e.g., "Ambroise Paré" fails when embedded in a URL bc of the é, so use "Ambroise%20Par%C3%A9" 
See https://www.url-encode-decode.com/
*******************************************************************************/
capture program drop ModifyNameForURL
program define ModifyNameForURL, rclass

	* Purpose: turn an author's name into the query term embedded in the search URL.
	* Argument: Name - the author's name as it appears in the author list; "" means "blank query".
	* Returns:  r(NameForURL) - the (possibly Germanized) name, percent-encoded and wrapped in %22
	*           (a URL-encoded double quote); or the empty string when Name is empty.

	args Name 
		
	local NameForURL = "`Name'"
	
	* Add periods to initials
	if ("`NameForURL'" == "E T A Hoffmann") {
		local NameForURL = "E. T. A. Hoffmann"
	}
	else if ("`NameForURL'" == "John C Calhoun") {
		local NameForURL = "John C. Calhoun"
	}
	* Replace English names with German names - especially relevant for Greek and Latin authors
	else if ("`NameForURL'" == "Aristotle") {
		local NameForURL = "Aristoteles"
	}
	else if ("`NameForURL'" == "Plato") {
		local NameForURL = "Platon"
	}
	else if ("`NameForURL'" == "Dostoyevsky") {
		local NameForURL = "Dostojewski"
	}
	else if ("`NameForURL'" == "Thucydides") {
		local NameForURL = "Thukydides"
	}
	else if ("`NameForURL'" == "Augustine") {
		local NameForURL = "Augustinus"
	}
	else if ("`NameForURL'" == "Aquinas") {
		local NameForURL = "Aquin"
	}
	else if ("`NameForURL'" == "John Calvin") {
		local NameForURL = "Johannes Calvin"
	}
	else if ("`NameForURL'" == "Pericles") {
		local NameForURL = "Perikles"
	}
	else if ("`NameForURL'" == "Epictetus") {
		local NameForURL = "Epiktet"
	}
	else if ("`NameForURL'" == "Marcus Aurelius") {
		local NameForURL = "Mark Aurel"
	}
	else if ("`NameForURL'" == "Kempis") {
		local NameForURL = "Kempen"
	}
	else if ("`NameForURL'" == "Aeschylus") {
		local NameForURL = "Aischylos"
	}
	else if ("`NameForURL'" == "Sophocles") {
		local NameForURL = "Sophokles"
	}
	else if ("`NameForURL'" == "Pliny the Younger") {
		local NameForURL = "Plinius der Jüngere"
	}
	else if ("`NameForURL'" == "Virgil") {
		local NameForURL = "Vergil"
	}
	else if ("`NameForURL'" == "Herodotus") {
		local NameForURL = "Herodot"
	}
	else if ("`NameForURL'" == "Hippocrates") {
		local NameForURL = "Hippokrates"
	}
	else if ("`NameForURL'" == "Copernicus") {
		local NameForURL = "Kopernikus"
	}
	
	* Percent-encode characters that cannot appear raw in a URL. See https://www.url-encode-decode.com/
	* Spaces first: a space cannot be an element of the token list below.
	* A missing count (.) in subinstr() means "replace every occurrence".
	local NameForURL = subinstr("`NameForURL'", " ", "%20", .)
	
	* Each token is <character>=<percent-encoded UTF-8 bytes>. The table covers the accented
	* Latin letters likely to occur in European names; extend it if a name uses another character.
	foreach Pair in ///
		à=%C3%A0 á=%C3%A1 â=%C3%A2 ã=%C3%A3 ä=%C3%A4 å=%C3%A5 æ=%C3%A6 ç=%C3%A7 ///
		è=%C3%A8 é=%C3%A9 ê=%C3%AA ë=%C3%AB ì=%C3%AC í=%C3%AD î=%C3%AE ï=%C3%AF ///
		ñ=%C3%B1 ò=%C3%B2 ó=%C3%B3 ô=%C3%B4 õ=%C3%B5 ö=%C3%B6 ø=%C3%B8 ///
		ù=%C3%B9 ú=%C3%BA û=%C3%BB ü=%C3%BC ý=%C3%BD ß=%C3%9F ///
		Á=%C3%81 Ä=%C3%84 Å=%C3%85 Ç=%C3%87 È=%C3%88 É=%C3%89 Ó=%C3%93 Ö=%C3%96 Ø=%C3%98 Ü=%C3%9C {
		* usubstr() is character-based, so it extracts the whole multi-byte letter.
		local Char = usubstr("`Pair'", 1, 1)
		local Code = substr("`Pair'", strpos("`Pair'", "=") + 1, .)
		local NameForURL = subinstr("`NameForURL'", "`Char'", "`Code'", .)
	}
	
	* For a URL, the name must be wrapped with quotation marks represented as %22
	local NameForURL = "%22`NameForURL'%22"
	
	* Finally, if Name is an empty string, "", it means we want to get the total number of news phrases. 
	* So we want the URL to contain &query=&, not &query=""&
	* In this case we want NameForURL to also be an empty string.
	if ("`Name'" == "") {
		local NameForURL = ""
	}
		
	return local NameForURL = "`NameForURL'"

end

/*******************************************************************************
Program to scrape data for an arbitrary name and year and append it as a new row to the Mata matrix "Data"
*******************************************************************************/
capture program drop WebScrapeData 
program define WebScrapeData 

	* Purpose: fetch the number of search results for one query term and one year from three
	*          sources (all newspapers, Vorwarts, Der Sozialdemokrat) and append one row
	*          (Name, Year, CiteAll, CiteVorwarts, CiteDerSozial) to the Mata matrix Data.
	* Arguments: Name - author name; "" requests the blank (unconditional) query.
	*            Year - the calendar year to search (1 January to 31 December).
	* Requires:  the Mata matrix Data must already exist as a string matrix with 5 columns.
	* Effects:   clears the dataset in memory; overwrites the Mata matrix CurrentData.
	* Errors:    exits with code 498 if no result count can be read after MaxAttempts tries.
	* Note:      local macros of the calling do-file (e.g. `quietly') are not visible in here;
	*            callers control verbosity by prefixing the call itself with quietly.

	args Name Year
	
	clear
	
	* Read r(NameForURL) through macro expansion, so that an empty result stays empty
	* (evaluating r(NameForURL) as an expression would turn an empty result into ".").
	ModifyNameForURL "`Name'"
	local NameForURL "`r(NameForURL)'"
	
	* Upper bound on download attempts per URL, so a permanent failure cannot loop forever.
	local MaxAttempts = 20
	
	* The URL to web-scrape. Use the local macros / args Name and Year.
	* The language of newspapers is German, the lang (of the website) is English. 1 row of displayed output (if this were viewed in a web browser) to minimize unnecessary data transfer.
	*
	* Loop over 3 sources to affect the suffix to the URL: all newspapers, Vorwarts newspaper, and Der Sozialdemokrat 
	foreach Source in "All" "Vorwarts" "DerSozial" {
		
		* Generate appropriate URL suffixes for each source 
		local Suffix = ""
		if ("`Source'" == "Vorwarts") {
			local Suffix = "&zdb_id=2814128-3"
		}
		else if ("`Source'" == "DerSozial") {
			local Suffix = "&zdb_id=3073896-9"
		}
	
		local URL = "https://www.deutsche-digitale-bibliothek.de/search/newspaper?query=`NameForURL'&language=ger&lang=en&rows=1&fromDay=1&fromMonth=1&fromYear=`Year'&toDay=31&toMonth=12&toYear=`Year'`Suffix'"
		
		* Download and parse, retrying on failure. An attempt counts as failed if the download
		* errors out OR if the page does not contain the result-count marker.
		local Count = ""
		local Attempt = 0
		while ("`Count'" == "" & `Attempt' < `MaxAttempts') {
			local ++Attempt
			
			* Back off before every retry: 1 second more per failed attempt, capped at 15 seconds (sleep takes milliseconds).
			if (`Attempt' > 1) {
				sleep `=min(1000 * (`Attempt' - 1), 15000)'
			}
			
			* Read the URL's HTML source into memory, one line of HTML per observation.
			* Use "$" as a placeholder delimiter because there are no delimiters.
			* varnames(nonames) and stringcols(_all) guarantee the text arrives in a string variable named v1.
			clear
			capture import delimited "`URL'", delimiter("$") bindquotes(nobind) varnames(nonames) stringcols(_all)
			if (_rc == 1) {
				* Return code 1 means the user pressed Break: stop instead of retrying.
				exit 1
			}
			if (_rc != 0) {
				continue
			}
			capture confirm string variable v1
			if (_rc != 0) {
				continue
			}
			
			* We want to find the line that says <span class="results-count"> because the next line is the number of results.
			* Since our target string itself contains quotation marks, we wrap it in compound quotes.
			generate long obs_no = _n
			summarize obs_no if strtrim(v1) == `"<span class="results-count">"', meanonly
			if (r(N) > 0) {
				* r(min) is the first matching line; the count is on the line after it.
				local Count = strtrim(v1[r(min) + 1])
			}
		}
		
		if ("`Count'" == "") {
			display as error `"WebScrapeData: no result count found after `MaxAttempts' attempts for `URL'"'
			exit 498
		}
		
		* Keep the count as a string (it may contain thousands separators) under the name
		* CiteAll, CiteVorwarts, or CiteDerSozial.
		local Cite`Source' "`Count'"
	
	}
	
	clear
	
	* If Name is the empty string "", use a placeholder name so that the row always has 5 columns.
	* In that case the caller drops or ignores the name anyway.
	if ("`Name'" == "" | "`Name'" == ".") {
		local Name = "NULL"
	}
	
	* Assemble the row for this Name and Year and append it to Data.
	mata: CurrentData = (st_local("Name"), st_local("Year"), st_local("CiteAll"), st_local("CiteVorwarts"), st_local("CiteDerSozial"))
	mata: Data = Data \ CurrentData
	
	/***************************************************************************
	* In case import delimited does not work to read data off the internet, here is code to use cURL.exe to read the URL, write it to the storage drive, and then read it back into Stata.
	* This is obviously awkward. It is even more awkward because of the fact that cURL.exe is not synced with Stata, so Stata will continue execution before cURL is finished writing the file.
	* This requires some awkward work-arounds, as documented below.
		
		capture `quietly' erase "curl.txt"
		
		* Use winexec to execute a Windows program 
		* Use the Windows program cURL to download the webpage and save it to "curl.txt"
		* We write the trace (debugging log) because cURL is not synced with Stata and we don't know when cURL is finished writing curl.txt. 
		* Simply checking whether cURL.txt exists is not enough because Stata is able to read the file even when it is only partially written.
		* Instead, by checking whether the trace log exists, we can determine whether cURL is finished.
		winexec curl --output "curl.txt" --trace "trace.txt" --url "`URL'"
		
		* Check whether trace.txt exists by repeatedly trying to delete it until the delete is successful.
		local needToDeleteFile = 1
		while (`needToDeleteFile' == 1) {
			capture `quietly' erase "trace.txt"
			
			if (_rc == 0) {
				local needToDeleteFile = 0
			}
		}
		
		* Read curl.txt into Stata.
		* But winexec curl executes outside of Stata, so it will not be synced with Stata.
		* So when the next line of Stata code executes, curl may not have finished writing the file.
		* Therefore, in a loop, keep trying to read the file. Loop until successful.
		local needToReadFile = 1
		while (`needToReadFile' == 1) {
			capture noisily import delimited "curl.txt", delimiter("$") bindquotes(nobind)
			
			if (_rc == 0) {
				local needToReadFile = 0
			}
		}

		* Delete curl.txt, so that next time we run winexec curl and read curl.txt, we aren't reading curl.txt from a previous loop.
		* Curl would overwrite the file, but since curl and Stata aren't synced, we cannot know whether the currently existing file is new or old.
		* In a loop, keep trying to delete the file. Loop until successful.
		local needToDeleteFile = 1
		while (`needToDeleteFile' == 1) {
			capture `quietly' erase "curl.txt"
			
			if (_rc == 0) {
				local needToDeleteFile = 0
			}
		}
	***************************************************************************/
	
end

/*******************************************************************************
Import original JPE data to create a list of authors
*******************************************************************************/
* Load the author panel from the sibling "data files" directory.
clear
cd ..
cd "data files"
use "all_authors_with_citations_and_indicators.dta"

* Reduce the panel to one row per author by taking the 1900 cross-section,
* then remove the authors we do not want to scrape and keep only the name column.
keep if Year == 1900
drop if inlist(Name, "Marx", "Hegel", "August Bebel", "Eduard Bernstein")
keep Name

* Collect the distinct author names in the local macro AllAuthorNames
quietly levelsof Name, local(AllAuthorNames)

/*******************************************************************************
Now scrape the website for data
*******************************************************************************/

* Initialize the mata matrix "Data" with 0 rows, and 5 columns - 1 column each for author's name and year, and 3 for citation counts - 1 for unconditional counts, and 1 each for Vorwarts and Der Sozialdemokrat.
* Start from an empty dataset and an empty 0x5 string matrix that WebScrapeData appends to.
clear
mata: Data = J(0, 5, "")

* Outer loop: authors. Inner loop: every year from 1878 through 1932.
foreach Name of local AllAuthorNames {

	* Show the raw name and its URL-encoded form as a progress indicator.
	display "`Name'"
	`quietly' ModifyNameForURL "`Name'"
	display r(NameForURL)

	forvalues Year = 1878/1932 {
		display "`Year'"
		`quietly' WebScrapeData "`Name'" "`Year'"
	}
}

* Turn the accumulated Mata string matrix Data into a Stata dataset, one variable per column.
clear
getmata (Name Year CiteAll CiteVorwarts CiteDerSozial) = Data

* Everything arrives as text: convert the year and the counts to numbers,
* stripping the commas used as thousands separators.
destring Year Cite*, replace ignore(",")

* Name-plus-year key used later for the 1:1 merge with the author panel.
egen NameYear = concat(Name Year)

compress
save "german_newspaper_data.dta", replace

* Unconditional totals: the number of results per year for a blank query, obtained by passing "" as Name to WebScrapeData.
* As before there is one count for all newspapers, one for Vorwarts, and one for Der Sozial,
* so Data is again a 0x5 string matrix (Name, Year, CiteAll, CiteVorwarts, CiteDerSozial) to begin with.
clear
mata: Data = J(0, 5, "")
display "No names, i.e. total cites for each year, unconditional."
forvalues Year = 1878/1932 {
	display "`Year'"
	`quietly' WebScrapeData "" "`Year'"
}

* Convert the matrix to a dataset; the placeholder name column carries no information here.
clear
getmata (Name Year CiteAll CiteVorwarts CiteDerSozial) = Data
drop Name
destring Year Cite*, replace ignore(",")

* Mark the counts as unconditional before saving.
rename (CiteAll CiteVorwarts CiteDerSozial) (UnconditionalCiteAll UnconditionalCiteVorwarts UnconditionalCiteDerSozial)
compress
save "german_newspaper_data_unconditional.dta", replace

* Merge the by-author-by-year citations with the unconditional by-year citations, by year.
clear
use "german_newspaper_data.dta"
merge m:1 Year using "german_newspaper_data_unconditional.dta"
drop _merge
* drop if missing(Name)

* Rename and label variables 
rename CiteAll 						NewsCite_AllNews
rename CiteVorwarts 				NewsCite_Vorwarts
rename CiteDerSozial 				NewsCite_DerSozial
rename UnconditionalCiteAll			NewsCite_Uncond_AllNews
rename UnconditionalCiteVorwarts	NewsCite_Uncond_Vorwarts 
rename UnconditionalCiteDerSozial	NewsCite_Uncond_DerSozial
* Variable labels are limited to 80 characters, so every label below is kept within that limit.
label variable NewsCite_AllNews "German news cites - all newspapers in Deutsche Digitale Bibliothek"
label variable NewsCite_Vorwarts "German news cites - Vorwarts only"
label variable NewsCite_DerSozial "German news cites - Der Sozialdemokrat only"
label variable NewsCite_Uncond_AllNews "German news cites - total unconditional annual cites (not any phrase)"
label variable NewsCite_Uncond_Vorwarts "German news cites - Vorwarts-only unconditional annual cites (not any phrase)"
label variable NewsCite_Uncond_DerSozial "German news cites - Der Sozial-only unconditional annual cites (not any phrase)"

* Generate variables containing author-by-year citations normalized by total citations.
* Do this twice, once including Vorwarts and Der Sozial, and once excluding.
* A zero or missing denominator yields a missing value.
generate double NewsCite_TotalNorm = NewsCite_AllNews / NewsCite_Uncond_AllNews
generate double NewsCite_NetNorm = (NewsCite_AllNews - NewsCite_Vorwarts - NewsCite_DerSozial) / (NewsCite_Uncond_AllNews - NewsCite_Uncond_Vorwarts - NewsCite_Uncond_DerSozial)
label variable NewsCite_TotalNorm "Author citations divided by total unconditional citations, by year"
* The full description of NewsCite_NetNorm exceeds the 80-character label limit,
* so the label is abbreviated and the full text is attached as a note.
label variable NewsCite_NetNorm "Author cites / total uncond. cites, by year, excl. Vorwarts & Der Sozial"
notes NewsCite_NetNorm: Author citations divided by total unconditional citations, by year, excluding Vorwarts and Der Sozial from numerator and denominator

* Save data 
save "german_newspaper_data_cond_and_uncondit.dta", replace

* Attach the newspaper counts to the original JPE author panel, matching on the name-plus-year key.
clear
use "all_authors_with_citations_and_indicators.dta"
egen NameYear = concat(Name Year)
merge 1:1 NameYear using "german_newspaper_data_cond_and_uncondit.dta"
* Diagnostics for unmatched rows, left disabled:
* tabulate NameYear if (_merge == 1 & Year <= 1952)
* tabulate NameYear if (_merge == 2)

* Keep only author-years present in both files, then drop the merge bookkeeping variables.
keep if _merge == 3
drop _merge NameYear

save "german_newspaper_data_w_indicators.dta", replace

* Remove the intermediate files; capture ignores any file that is already gone.
foreach TempFile in "german_newspaper_data.dta" "german_newspaper_data_unconditional.dta" "german_newspaper_data_cond_and_uncondit.dta" {
	capture erase "`TempFile'"
}

clear